# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
# Read the 2017 Ford GoBike trip records from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='2017-fordgobike-tripdata.csv')
# First look at the data: leading rows, column overview, summary statistics
df.head(n=5)
df.info()
df.describe()
# Parse the trip start/end timestamps into datetime columns
for time_col in ('start_time', 'end_time'):
    df[time_col] = pd.to_datetime(df[time_col])
# Station and bike ids are labels, not quantities: store them as strings
for id_col in ('start_station_id', 'end_station_id', 'bike_id'):
    df[id_col] = df[id_col].astype(str)
Because the user information is important for the insights I am after, and because it is the only information containing null values, I decided to drop the rows with null values in user birth year or gender.
# Remove, in place, every row that has at least one missing value
df.dropna(axis=0, how='any', inplace=True)
# With the missing values gone, birth years can be stored as integers
df['member_birth_year'] = df['member_birth_year'].astype(int)
df.info()
After cleaning, the dataset contains 453159 bike trips, all made in the year 2017. Every trip is described with information about the time, the start and end stations, the bike and the user.
Time:
- Duration in seconds
- Start time
- End Time
Stations:
- Start station id
- Start station name
- Start station latitude
- Start station longitude
- End station id
- End station name
- End station latitude
- End station longitude
Bike:
- Bike id
User:
- User type
- User birth year
- User gender
The most relevant information is the user information and the start end trip time and location.
The user data can provide interesting information about the demographics of bike use. On the other hand, the start and end information could show the times when the service is used most and the zones of the city where demand is highest.
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
First let's look at the duration of trips.
# Histogram of trip duration over the full range, using 60-second bins
bin_width = 60
bin_edges = np.arange(0, df.duration_sec.max() + bin_width, bin_width)
plt.figure(figsize=(10, 5))
plt.hist(x='duration_sec', bins=bin_edges, data=df)
plt.xlabel('Duration [s]')
plt.show()
This graphic shows that there are outliers, so let's filter them out to see better what happens with this variable.
# Same histogram with much coarser 2500-second bins to expose the long tail
bin_width = 2500
bin_edges = np.arange(0, df.duration_sec.max() + bin_width, bin_width)
plt.figure(figsize=(10, 5))
plt.hist(x='duration_sec', bins=bin_edges, data=df)
plt.xlabel('Duration [s]')
plt.show()
Looking at the histogram with wider intervals, we can see that most of the trips are under 10000 seconds. So let's apply this filter to see the trend.
# Keep only trips shorter than 10000 s and plot them with 60-second bins
filtered_df = df.loc[df['duration_sec'] < 10000]
bin_width = 60
bin_edges = np.arange(0, filtered_df.duration_sec.max() + bin_width, bin_width)
plt.figure(figsize=(10, 5))
plt.hist(x='duration_sec', bins=bin_edges, data=filtered_df)
plt.xlabel('Duration [s]')
plt.show()
We can even look closer under 3000 seconds.
# Narrow the filter to trips shorter than 3000 s; filtered_df is rebound here
filtered_df = df.loc[df['duration_sec'] < 3000]
bin_width = 60
bin_edges = np.arange(0, filtered_df.duration_sec.max() + bin_width, bin_width)
plt.figure(figsize=(10, 5))
plt.hist(x='duration_sec', bins=bin_edges, data=filtered_df)
plt.xlabel('Duration [s]')
plt.show()
And now it is clear that the histogram is skewed to the right, and the bike trips tend to be concentrated between 1 and 15 minutes approx.
Now let's look at the users information. Let's start with gender.
# Number of trips per rider gender
sb.countplot(x='member_gender', data=df)
This shows that the service is used by more men than women.
the user type:
# Number of trips per user type
sb.countplot(x='user_type', data=df)
Here we can see that the service is widely more used by subscribers than other customers.
... and the birth year of the members.
# Distribution of riders' birth years in five-year bins
bin_width = 5
birth_years = df['member_birth_year']
intervals = np.arange(birth_years.min(), birth_years.max() + bin_width, bin_width)
plt.figure(figsize=[10, 5])
plt.hist(data=df, x='member_birth_year', bins=intervals)
# A birth year is a calendar year, not a duration, so the label has no "[s]" unit
plt.xlabel("User's Birth Year")
plt.show()
The distribution here is skewed to the left and as expected, the people under 40 years old are more likely to use the service.
Now let's take a look at the start date and see it by month, hour of the day and day of the week.
# Count rides by month, hour of day and weekday of the start time.
# Each bar chart gets its own figure: consecutive Series.plot calls would
# otherwise draw on the same current axes and overwrite each other.
time_parts = (
    ('month', 'Month'),
    ('hour', 'Hour of day'),
    ('weekday', 'Day of week (0 = Monday)'),
)
for part, axis_label in time_parts:
    start_part = getattr(df['start_time'].dt, part)
    plt.figure()
    df['start_time'].groupby(start_part).count().plot(kind='bar')
    plt.xlabel(axis_label)
    plt.ylabel('Number of rides')
    plt.show()
The last three graphics show that:
The end time was not plotted, on the assumption that most trips are short and finish a few minutes after they start.
The bike id might not seem like a very useful feature, but it can show us how homogeneously the system is used and whether some bikes are used considerably more than others.
# Histogram of how many trips each individual bike made
trips_per_bike = df.groupby('bike_id').size()
trips_per_bike.hist()
This histogram shows that not all the bikes are equally used.
We can do the same with the stations ids and see the distribution of the system use among the stations.
# Histogram of trips per station, once for departures and once for arrivals.
# Each histogram gets its own figure; drawn back to back on the same axes the
# second one would sit on top of the first.
station_roles = (
    ('start_station_id', 'Trips started per station'),
    ('end_station_id', 'Trips ended per station'),
)
for id_col, axis_label in station_roles:
    plt.figure()
    df[id_col].groupby(df[id_col]).count().hist()
    plt.xlabel(axis_label)
    plt.ylabel('Number of stations')
    plt.show()
The last two graphics show that the use of the stations is not very homogeneous either.
Finally we can look at the position stations to see the distribution of the rides in north-south or east-west.
df.describe()
# Overlay start- and end-station histograms, first for latitude and then for
# longitude, using bins 0.01 degrees wide.
for coord in ('latitude', 'longitude'):
    start_col = 'start_station_' + coord
    end_col = 'end_station_' + coord
    intervals_start = np.arange(df[start_col].min(), df[start_col].max() + 0.01, 0.01)
    intervals_end = np.arange(df[end_col].min(), df[end_col].max() + 0.01, 0.01)
    plt.figure(figsize=[10, 5])
    plt.hist(data=df, x=start_col, bins=intervals_start, alpha=0.5, label='start station')
    plt.hist(data=df, x=end_col, bins=intervals_end, alpha=0.5, label='end station')
    plt.xlabel("station " + coord)
    plt.ylabel('Number of rides')
    # Without a legend the two translucent series cannot be told apart
    plt.legend()
    plt.show()
Here we can see some kind of zones within the city and apparently most of the bikes stay in their zones.
The main findings are summarized as follows. Duration of rides: skewed right, with most of the trips under 30 minutes. Users: mainly male, mostly subscribers, and mostly young adults. Dates: autumn has the most use; 8:00 and 17:00 are the peak hours; the system is used less on weekends. Distribution of the use of the system: not uniform across bikes or stations. Geography: there are some distinct zones, and bikes mostly stay in their zones.
I needed to filter the duration of the trips because there were outliers with very high values. They were very few compared with the rest of the data set, and they made the visualizations unclear.
Besides filtering the duration times, I changed some data types: the ids were converted from int to str, the users' birth years from float to int, and the dates from str to datetime. This helps to build the graphics more accurately.
Let's start looking at the correlations between pairs of data for numerical variables.
df.head(1)
# Numeric columns whose pairwise correlations are shown in the heatmap
numeric_cols = [
    'duration_sec',
    'start_station_latitude',
    'start_station_longitude',
    'end_station_latitude',
    'end_station_longitude',
    'member_birth_year',
]
corr_matrix = df[numeric_cols].corr()
plt.figure(figsize=(8, 5))
sb.heatmap(corr_matrix, annot=True, fmt='.4f', cmap='coolwarm', center=0)
plt.show()
The only variables that seem to have some strong correlation are the coordinates of the station positions.
# Pair plot of start-station coordinates on a random sample of 500 filtered
# rides: histograms on the diagonal, translucent scatter plots elsewhere
coord_cols = ['start_station_latitude', 'start_station_longitude']
sample_idx = np.random.choice(filtered_df.index, size=500, replace=False)
rides_sample = filtered_df.loc[sample_idx]
g = sb.PairGrid(data=rides_sample, vars=coord_cols, height=5)
g = g.map_diag(plt.hist, bins=20)
g.map_offdiag(plt.scatter, marker='o', alpha=0.2)
Here we can see the zones I mentioned before: apparently there are three big zones for this service in the San Francisco area. Looking at the map, they seem to be San Francisco, Oakland and San Jose.
It would be interesting to visualize a flowchart to see where are the bikes moving to and what are the routes with higher demand. In order to do this it would be useful to separate the trips in 4 categories:
Later it would be possible to compare the age and gender of users across the cities and, for the long trips, to find out what kind of people are making them.
# Create the station zones SF, OA and SJ from the station longitude.
# Longitudes below sf_max_long are labelled 'SF', those below oa_max_long
# 'OA', and everything else 'SJ'.
sf_max_long = -122.35
oa_max_long = -122


def _zone_from_longitude(longitude):
    """Return an array of zone labels ('SF', 'OA' or 'SJ') for a longitude Series.

    The whole column is labelled at once instead of row by row with
    ``iterrows``, which is very slow on a frame of this size. ``np.select``
    takes the first matching condition, so the order below reproduces the
    original if/elif/else chain.
    """
    return np.select(
        [longitude < sf_max_long, longitude < oa_max_long],
        ['SF', 'OA'],
        default='SJ',
    )


df['start_zone'] = _zone_from_longitude(df['start_station_longitude'])
df['end_zone'] = _zone_from_longitude(df['end_station_longitude'])
df.head()
# Count the trips that start in one zone and end in a different one
crosses_zones = df['start_zone'] != df['end_zone']
int(crosses_zones.sum())
# A route is identified by "<start station id>-<end station id>"
df['route'] = df['start_station_id'] + '-' + df['end_station_id']
# Split the trips into inter-city ones and local ones, then split the local
# trips by zone so each type of trip can be compared and visualized
same_zone = df['start_zone'] == df['end_zone']
intercities = df[~same_zone]
local = df[same_zone]
sanFrancisco, oakland, sanJose = (
    local[local['start_zone'] == zone] for zone in ('SF', 'OA', 'SJ')
)
We will use them later
Now let's see if the duration of the trips changes with the age of people and the gender.
# Mean trip duration per gender: all trips first, then the filtered trips
df.groupby('member_gender')['duration_sec'].mean()
filtered_df.groupby('member_gender')['duration_sec'].mean()
On average, the trips of women are longer than the trips of men. Even after filtering out the longest trips, women have a longer average, though the difference is much smaller. Apparently the longest trips are taken mostly by women.
# Mean trip duration per ten-year birth cohort, for all trips and then for
# the filtered trips. Each chart gets its own figure.
for rides in (df, filtered_df):
    birth_year = rides['member_birth_year']
    # The edges must run one full step past the latest birth year. Stopping
    # the range at max() leaves the youngest riders outside every bin, and
    # pd.cut maps out-of-range values to NaN, so they silently drop out.
    decade_edges = np.arange(birth_year.min() - 6, birth_year.max() + 10, 10)
    plt.figure()
    rides.groupby(pd.cut(birth_year, decade_edges))['duration_sec'].mean().plot(kind='bar')
    plt.xlabel('Birth year interval')
    plt.ylabel('Mean duration [s]')
    plt.show()
Looking at the average duration of rides per decade of birth of the users it seems to be quite independent, even filtering the high outliers of duration.
So far it has been possible to observe some segmentation of the rides by zone, which seems quite normal considering that the system covers three cities. In my opinion the most interesting thing to observe could be the flow of trips within the cities: visualizing how people move during the day and where the areas of higher demand are.
I just checked that there is not much of a relationship between age and trip duration. I also saw that women's trips are, on average, longer than men's.
Let's visualize the lines that connect stations in the trips and try to identify the routes that are the most used. And also see how it changes during the day and per city.
# For each city, draw a random sample of local rides as faint lines from the
# start station to the end station; heavily used routes show up darker.
city_frames = (
    ('San Francisco', sanFrancisco),
    ('Oakland', oakland),
    ('San Jose', sanJose),
)
for city_name, city_rides in city_frames:
    # Cap the sample at the number of rides available: sampling without
    # replacement raises ValueError when asked for more rows than exist.
    sample_size = min(5000, len(city_rides))
    samples = np.random.choice(city_rides.index, sample_size, replace=False)
    rides_sample = city_rides.loc[samples, :]
    y = [rides_sample['start_station_latitude'], rides_sample['end_station_latitude']]
    x = [rides_sample['start_station_longitude'], rides_sample['end_station_longitude']]
    # One figure per city so the three maps do not share a single set of axes
    plt.figure()
    plt.plot(x, y, color='b', alpha=0.02, marker='.')
    plt.title(city_name)
    plt.xlabel('longitude')
    plt.ylabel('latitude')
    plt.show()
Here we can see that every city has specific places with a high demand for bikes. What could be interesting now is to see where the bikes go during the day, in order to find out whether workplaces and schools, as common destinations, are well distributed or rather centralized.
First for San Francisco
# Split the San Francisco rides by start hour: before noon / noon onwards
morning = sanFrancisco[sanFrancisco["start_time"].dt.hour < 12]
afternoon = sanFrancisco[sanFrancisco["start_time"].dt.hour >= 12]

# One panel per half of the day. Each route found in a random sample of rides
# is drawn as a blue line whose width grows with its ride count, with a green
# marker at the start station and a red marker at the end station.
fig, ax = plt.subplots(nrows=2, figsize=[25, 35])
for axis, title, rides in zip(ax, ('Morning Rides', 'Afternoon Rides'), (morning, afternoon)):
    # Never ask for more rows than exist (sampling is without replacement)
    samples = np.random.choice(rides.index, min(1000, len(rides)), replace=False)
    rides_sample = rides.loc[samples, :]
    by_route = rides_sample.groupby('route')
    routes_count = by_route['duration_sec'].count()
    routes_x1 = by_route['start_station_longitude'].mean()
    routes_x2 = by_route['end_station_longitude'].mean()
    routes_y1 = by_route['start_station_latitude'].mean()
    routes_y2 = by_route['end_station_latitude'].mean()
    busiest = routes_count.max()  # hoisted: constant for the whole panel
    axis.set_title(title)
    axis.set_xlabel('longitude')
    axis.set_ylabel('latitude')
    # Walk the underlying arrays in lockstep. The Series are indexed by the
    # route string, so series[i] with an integer only works through pandas'
    # deprecated positional fallback.
    for size, x1, x2, y1, y2 in zip(routes_count.values, routes_x1.values, routes_x2.values,
                                    routes_y1.values, routes_y2.values):
        axis.plot([x1, x2], [y1, y2], color='b', alpha=0.3, linewidth=5 * size / busiest)
        axis.plot(x1, y1, color='g', alpha=0.2, marker='o', markersize=10 * size)
        axis.plot(x2, y2, color='r', alpha=0.2, marker='o', markersize=10 * size)
Oakland
# Split the Oakland rides by start hour: before noon / noon onwards
morning = oakland[oakland["start_time"].dt.hour < 12]
afternoon = oakland[oakland["start_time"].dt.hour >= 12]

# One panel per half of the day. Each route found in a random sample of rides
# is drawn as a blue line whose width grows with its ride count, with a green
# marker at the start station and a red marker at the end station.
fig, ax = plt.subplots(nrows=2, figsize=[25, 35])
for axis, title, rides in zip(ax, ('Morning Rides', 'Afternoon Rides'), (morning, afternoon)):
    # Never ask for more rows than exist (sampling is without replacement)
    samples = np.random.choice(rides.index, min(1000, len(rides)), replace=False)
    rides_sample = rides.loc[samples, :]
    by_route = rides_sample.groupby('route')
    routes_count = by_route['duration_sec'].count()
    routes_x1 = by_route['start_station_longitude'].mean()
    routes_x2 = by_route['end_station_longitude'].mean()
    routes_y1 = by_route['start_station_latitude'].mean()
    routes_y2 = by_route['end_station_latitude'].mean()
    busiest = routes_count.max()  # hoisted: constant for the whole panel
    axis.set_title(title)
    axis.set_xlabel('longitude')
    axis.set_ylabel('latitude')
    # Walk the underlying arrays in lockstep. The Series are indexed by the
    # route string, so series[i] with an integer only works through pandas'
    # deprecated positional fallback.
    for size, x1, x2, y1, y2 in zip(routes_count.values, routes_x1.values, routes_x2.values,
                                    routes_y1.values, routes_y2.values):
        axis.plot([x1, x2], [y1, y2], color='b', alpha=0.3, linewidth=5 * size / busiest)
        axis.plot(x1, y1, color='g', alpha=0.2, marker='o', markersize=8 * size)
        axis.plot(x2, y2, color='r', alpha=0.2, marker='o', markersize=8 * size)
San Jose
# Split the San Jose rides by start hour: before noon / noon onwards
morning = sanJose[sanJose["start_time"].dt.hour < 12]
afternoon = sanJose[sanJose["start_time"].dt.hour >= 12]

# One panel per half of the day. Each route found in a random sample of rides
# is drawn as a blue line whose width grows with its ride count, with a green
# marker at the start station and a red marker at the end station.
fig, ax = plt.subplots(nrows=2, figsize=[25, 35])
for axis, title, rides in zip(ax, ('Morning Rides', 'Afternoon Rides'), (morning, afternoon)):
    # Never ask for more rows than exist (sampling is without replacement)
    samples = np.random.choice(rides.index, min(1000, len(rides)), replace=False)
    rides_sample = rides.loc[samples, :]
    by_route = rides_sample.groupby('route')
    routes_count = by_route['duration_sec'].count()
    routes_x1 = by_route['start_station_longitude'].mean()
    routes_x2 = by_route['end_station_longitude'].mean()
    routes_y1 = by_route['start_station_latitude'].mean()
    routes_y2 = by_route['end_station_latitude'].mean()
    busiest = routes_count.max()  # hoisted: constant for the whole panel
    axis.set_title(title)
    axis.set_xlabel('longitude')
    axis.set_ylabel('latitude')
    # Walk the underlying arrays in lockstep. The Series are indexed by the
    # route string, so series[i] with an integer only works through pandas'
    # deprecated positional fallback.
    for size, x1, x2, y1, y2 in zip(routes_count.values, routes_x1.values, routes_x2.values,
                                    routes_y1.values, routes_y2.values):
        axis.plot([x1, x2], [y1, y2], color='b', alpha=0.3, linewidth=5 * size / busiest)
        axis.plot(x1, y1, color='g', alpha=0.2, marker='o', markersize=8 * size)
        axis.plot(x2, y2, color='r', alpha=0.2, marker='o', markersize=8 * size)
The last six graphics show that in every city there is a flow of bikes towards certain points in the morning and in the opposite direction in the afternoon. They also let us see the main zones of every city.
In this part of the analysis it became interesting to visualize how the routes are distributed within every city. It was possible to identify busy points and some trends in the morning flow as opposed to the afternoon flow.
The most interesting observation is the reversing flow of bikes between morning and afternoon, which helps to understand how people move around the city and where the system could be reinforced at certain hours of the day.